In [1]:
import pandas as pd
import numpy as np
In [2]:
# read_csv already returns a DataFrame, so no extra wrapping is needed
congress_words = pd.read_csv("../data/legislators.csv")
congress_words.head()
Out[2]:
In [3]:
congress_words.columns.tolist()
Out[3]:
In [4]:
l_bioGuides = congress_words.bioguide_id.tolist()
print l_bioGuides[:10]
In [5]:
from urllib2 import Request, urlopen
import json
from pandas.io.json import json_normalize
def requestWords(bioguide_id):
    """Fetch a legislator's top phrases from the Capitol Words API and join them with '|'."""
    bioguide_id = str(bioguide_id)
    url = ("http://capitolwords.org/api/1/phrases.json?entity_type=legislator"
           "&entity_value=" + bioguide_id + "&apikey=0bf8e7eb6ce146f48217bfee767c998d")
    request = Request(url)
    response = urlopen(request)
    contents = response.read()
    # an empty JSON response ("[]") is only 2 characters long
    if len(contents) > 2:
        data = json.loads(contents)
        words = json_normalize(data)
        list_of_words = words.ngram.tolist()
        return "|".join(list_of_words)
    else:
        return np.nan
In [6]:
congress_words['favorite_words'] = congress_words.apply(lambda row: requestWords(row['bioguide_id']),axis=1)
In [7]:
congress_words.favorite_words.head(20)
Out[7]:
In [8]:
congress_words = congress_words[congress_words.favorite_words.notnull()]
In [9]:
print "Number of legislators with word record:", len(congress_words.favorite_words)
In [10]:
favorite_words = congress_words.favorite_words.str.get_dummies(sep = "|")
print favorite_words.head(3)
favorite_words.columns[:100]
Out[10]:
In [11]:
favorite_words.shape
Out[11]:
In [12]:
favorite_words.columns[760:800]
Out[12]:
In [13]:
word_list = favorite_words.columns.tolist()
print "Some of the words in it", word_list[800:900]
In [46]:
def word_finder(columns, start):
    # return the position (offset by `start`) of the first name starting with "a";
    # everything before that point is a numeric/punctuation ngram rather than a word
    first = start
    for index, element in enumerate(columns, start):
        if element[0] == "a":
            first = index
            break
    return first
x = word_list
print word_finder(x, 0)

def wordlist(dataFrame):
    # return the column names from the first "a..." column onward
    columns = dataFrame.columns.tolist()
    return columns[word_finder(columns, 0):]
In [336]:
print word_list[776]
#del favorite_words['a']
word_list = favorite_words.columns.tolist()
In [16]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1)
vectorizer
Out[16]:
In [338]:
corpus = favorite_words.columns.tolist()
X = vectorizer.fit_transform(corpus)
len(corpus)
Out[338]:
In [322]:
analyze = vectorizer.build_analyzer()
print analyze("economy a this")
vectorizer.get_feature_names()[910:920]
Out[322]:
In [323]:
vectorizer.vocabulary_.get('document')  # words not seen in the training corpus are ignored by later calls to transform (returns None here)
Out[323]:
In [328]:
unrelated = vectorizer.transform(['Something completely unrelated']).toarray()
len(unrelated[0])
Out[328]:
In [313]:
def new_text_vector(string):
    # tokenize the string with the vectorizer's analyzer
    array = analyze(string)
    #array = vectorizer.transform([string]).toarray()
    return array
new_text_vector("lalalalalalalala Some piece of text I want to classify for being as rejecting discrimination")
Out[313]:
In [22]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
transformer
Out[22]:
In [23]:
tfidf = transformer.fit_transform(favorite_words)
tfidf_array = tfidf.toarray()
tfidf_array.shape
tfidf_array[20].max()
transformer.idf_
Out[23]:
In [464]:
analyze = vectorizer.build_analyzer()
analyze("iraq this a unanana")
v = CountVectorizer().fit(["iraq this a unanana"]).vocabulary_  # fit expects an iterable of documents, so wrap the string in a list
In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(corpus)
vec_idf = vectorizer.idf_
print len(vec_idf)
In [25]:
words_weight = pd.DataFrame(tfidf_array, index=congress_words.index , columns=corpus)
print congress_words.index
print words_weight.index
In [26]:
capitol_words = congress_words.merge(words_weight, right_index=True, left_index=True)
capitol_words.head()
Out[26]:
In [27]:
word_column_names_capitol = capitol_words.columns.tolist()[word_finder(capitol_words,0):]
capitol_words[word_column_names_capitol].head()
Out[27]:
In [28]:
capitol_words[word_column_names_capitol].sum().max()
Out[28]:
In [29]:
word_frequencies = (capitol_words[word_column_names_capitol]>0).astype(int).sum(axis=0).astype(float)/capitol_words.shape[0]
most_frequent_words = word_frequencies[word_frequencies>.95].index
most_frequent_words
Out[29]:
In [30]:
word_frequencies = (capitol_words[word_column_names_capitol]>0).astype(int).sum(axis=0)
word_frequencies.max()
Out[30]:
In [31]:
capitol_words.party_x.unique()
party_mask = capitol_words.party_x!="I"
two_party_words = capitol_words[party_mask]
print "Entries before getting rid of independents:", capitol_words.shape[0]
print "Entries after getting rid of independents:", two_party_words.shape[0]
print "Number of independents:", (capitol_words.shape[0])-(two_party_words.shape[0])
In [32]:
party_dummies = pd.get_dummies(capitol_words.party_x).astype(int)
party_dummies = party_dummies[["R"]]
party_dummies.head()
capitol_words = party_dummies.merge(capitol_words, right_index=True, left_index=True)
capitol_words.head()
Out[32]:
In [33]:
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
In [34]:
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt
In [35]:
X_words = capitol_words[word_column_names_capitol]
y_words = capitol_words["R"]
X_train,X_test,y_train,y_test = train_test_split(X_words,y_words,test_size=0.4)
from sklearn.tree import DecisionTreeClassifier
words_tree = DecisionTreeClassifier(max_depth=3, random_state=1)
words_tree.fit(X_train, y_train)
Out[35]:
In [36]:
words_tree.feature_importances_
features = pd.DataFrame({'feature':word_column_names_capitol, 'importance':words_tree.feature_importances_}).sort_values(by='importance',ascending=False)
In [37]:
features.head()
Out[37]:
In [38]:
def my_mask(df, column, condition, value):
    """Filter df rows where df[column] <condition> value."""
    new_data = []
    if condition == "==":
        new_data = df[df[column] == value]
    elif condition == "<=":
        new_data = df[df[column] <= value]
    elif condition == "!=":
        new_data = df[df[column] != value]
    elif condition == ">=":
        new_data = df[df[column] >= value]
    elif condition == ">":
        new_data = df[df[column] > value]
    elif condition == "<":
        new_data = df[df[column] < value]
    else:
        print "arguments needed -column, condition, value-"
    return new_data

def subset(df, column):
    """Return a dict mapping each unique value of `column` to its sub-DataFrame."""
    subsets = {}
    subs = df[column].unique()
    for element in subs:
        subsets[element] = my_mask(df, column, "==", element)
    print "New dictionary of dataframes available; subsets of:", subs
    return subsets

def clean_sparse_irrelevant(df):
    """Drop float columns that sum to zero (words that carry no weight at all)."""
    cols = df.columns
    deleted = 0
    for c in cols:
        x = df[c]
        if x.dtype == "float64":
            if x.sum() == 0:
                del df[c]
                deleted += 1
    print "DELETED:", deleted
    return df
In [39]:
states = subset(capitol_words,"state_x")
states['AK'].head()
parties = subset(capitol_words, "party_x")
parties['D'].head()
Out[39]:
In [69]:
clean_sparse_irrelevant(states['AK'])
AK = states["AK"]
AK_words = wordlist(states['AK'])
AK[AK_words].sum()
AK_word_count = pd.DataFrame({'feature':AK_words, 'words':AK[AK_words].sum()}).sort_values(by='words',ascending=False)
AK_word_count.head(20)
Out[69]:
With the sklearn.preprocessing package:
Centering sparse data would destroy its sparseness structure, but MaxAbsScaler and maxabs_scale were specifically designed for scaling sparse data, especially when the features are on different scales. scale and StandardScaler can also accept scipy.sparse matrices as input, as long as with_centering=False.
I will normalize one small subset of my data just to see how the values change.
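A tiny sketch (on a toy matrix, not my data) of what max-abs scaling does: each column gets divided by its largest absolute value, so zeros stay zero and the sparsity is preserved.
from sklearn.preprocessing import MaxAbsScaler
from scipy import sparse
toy = sparse.csr_matrix([[1.0, 0.0, 4.0],
                         [2.0, 0.5, 0.0]])
scaled = MaxAbsScaler().fit_transform(toy)
print scaled.toarray()  # columns now range within [-1, 1]; zero entries stay zero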
In [70]:
AK
Out[70]:
In [75]:
from sklearn.preprocessing import maxabs_scale
print maxabs_scale(AK.ix[:,43:], axis=0, copy=False)
AK.ix[:,43:] = maxabs_scale(AK.ix[:,43:], axis=0, copy=False)
print AK.ix[:,43:].head()
In [76]:
AK
Out[76]:
In [77]:
def word_maxabsscaler(dataFrame, index):
    # scale the word columns (from the first "a..." column onward) to [-1, 1] by max absolute value
    dataFrame.ix[:, word_finder(dataFrame, index):] = maxabs_scale(dataFrame.ix[:, word_finder(dataFrame, index):], axis=0, copy=False)
In [78]:
word_maxabsscaler(capitol_words,30)
clean_sparse_irrelevant(capitol_words)
capitol_words.head()
Out[78]:
Just to compare results between these models on this particular data set. This is an example of when it's a good idea to reduce the number of columns. There are more than 14,000 columns (the result of turning the most-spoken words into dummies and then taking their tf-idf weights), so far too many columns are being used to predict the target variable, Republican or Democrat. One of the risks of this setup is overfitting the model.
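A quick sketch of one way to pick how many columns to keep: fit a full PCA (like the one built a few cells below) and take the smallest number of components whose cumulative explained variance passes a threshold. The 0.90 threshold and the helper name here are just assumptions for illustration.
import numpy as np
from sklearn.decomposition import PCA

def n_components_for(X, threshold=0.90):
    # fit a full PCA and return how many components are needed
    # to explain `threshold` of the total variance
    pca_full = PCA().fit(X)
    cumulative = np.cumsum(pca_full.explained_variance_ratio_)
    return int(np.argmax(cumulative >= threshold) + 1)

# e.g. n_components_for(capitol_words[word_column_names_capitol], 0.90)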
In [80]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.linear_model import Lasso, Ridge, LinearRegression, LogisticRegression, ElasticNet
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.cross_validation import cross_val_score, train_test_split
import scipy.stats as stats
%matplotlib inline
import seaborn as sns
In [81]:
capitol_words.head()
Out[81]:
After normalization, some words had such a small global weight in the tf-idf matrix that their column sum was zero. I will not feed those to my model, because a column filled with zeros adds no variance to a sparse matrix. Also, column index 30 is where the sparse word matrix got attached to the original data set.
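A more vectorized way to drop those all-zero float columns, doing roughly what clean_sparse_irrelevant above already does (sketch only, assuming the word columns are float64):
float_cols = capitol_words.select_dtypes(include=["float64"]).columns
zero_cols = [c for c in float_cols if capitol_words[c].sum() == 0]
capitol_words = capitol_words.drop(zero_cols, axis=1)
print len(zero_cols), "all-zero word columns dropped"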
In [83]:
capitol_words[word_column_names_capitol].head()
Out[83]:
In [84]:
global_correlations = capitol_words.ix[:,836:].corr()
global_correlations.head()
Out[84]:
In [ ]:
sns.plt.figure(figsize=(24,20))
sns.heatmap(capitol_words.ix[:,836:].transpose().corr().values)
In [87]:
pca = PCA()
transformed_pca_x = pca.fit_transform(capitol_words[word_column_names_capitol])
component_names = ["component_"+str(comp) for comp in range(1, len(pca.explained_variance_)+1)]
transformed_pca_x = pd.DataFrame(transformed_pca_x,columns=component_names)
print "CCOMPONENT MATRIX:"
transformed_pca_x.head()
Out[87]:
In [91]:
component_matrix = pd.DataFrame(pca.components_,index=component_names, columns=word_column_names_capitol)
component_matrix["explained_variance_ratio"] = pca.explained_variance_ratio_
component_matrix["eigenvalue"] = pca.explained_variance_
The problem with this is that PCA collapses correlated features, and with words, if I built a model based on eliminating similar or correlated words, it could end up overfitted and would not do well at predicting a real example. Say one of the components was driven by the word "small", and "small" is correlated with "little", but I just deleted "little". Unless I have another way to capture semantic similarity, I can't get rid of those words just yet.
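For instance, before dropping a word I could at least check how strongly its column tracks a near-synonym. A sketch only; "small" and "little" are hypothetical column names borrowed from the example above and may not actually be in this vocabulary:
pair = ["small", "little"]
if all(w in capitol_words.columns for w in pair):
    print capitol_words[pair].corr()  # high correlation suggests the two words carry similar signal
else:
    print "one of the example words is not in the vocabulary"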
In [92]:
component_matrix.head()
Out[92]:
In [90]:
X = transformed_pca_x.ix[:,:500]
y = capitol_words["R"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
lr = LogisticRegression(C=1e9, penalty='l1')
lr.fit(X_train,y_train)
y_test_pred = lr.predict(X_test)
print "Test set accuracy of LR model: ",metrics.accuracy_score(y_test, y_test_pred)
In [93]:
from sklearn.cross_validation import KFold, train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn import metrics
import scipy.stats as stats
In [96]:
X = capitol_words[word_column_names_capitol]
y = capitol_words["R"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
lr = LogisticRegression(C=1e9, penalty='l2')
lr.fit(X_train,y_train)
y_test_pred = lr.predict(X_test)
print "Test set accuracy of LR model: ",metrics.accuracy_score(y_test, y_test_pred)
In [98]:
print "Null accuracy on the test set: ",y_test.mean()
In [99]:
from sklearn.dummy import DummyClassifier
dumb_model = DummyClassifier(strategy='most_frequent')
dumb_model.fit(X_train, y_train)
y_dumb_class = dumb_model.predict(X_test)
print "Most frequent class dummy classifier test accuracy: ",metrics.accuracy_score(y_test, y_dumb_class)
In [100]:
dumb_model = DummyClassifier(strategy='most_frequent')
dummy_scores = cross_val_score(dumb_model, X, y, cv=30)
real_scores = cross_val_score(LogisticRegression(),X , y,cv=30)
sns.plt.hist(dummy_scores)
sns.plt.hist(real_scores)
# we could use cv=StratifiedKFold when the classes are really unbalanced
#real_scores = cross_val_score(LogisticRegression(),X , y,cv=30)
print np.mean(dummy_scores)
print np.mean(real_scores)
print np.std(real_scores)
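As the comment above says, when the classes are really unbalanced the folds can be stratified so each fold keeps the class proportions. A sketch with the same old cross_validation API used in this notebook, reusing the X and y defined above (n_folds=10 is arbitrary):
from sklearn.cross_validation import StratifiedKFold, cross_val_score
from sklearn.linear_model import LogisticRegression
skf = StratifiedKFold(y, n_folds=10)  # each fold keeps the R/D proportions of y
stratified_scores = cross_val_score(LogisticRegression(), X, y, cv=skf)
print np.mean(stratified_scores), np.std(stratified_scores)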
In [ ]:
In [101]:
cm = metrics.confusion_matrix(y_test, y_test_pred)
print cm
sns.heatmap(cm)
Out[101]:
In [102]:
print "Sensitivity/Recall (TPR): ",metrics.recall_score(y_test,y_test_pred)
print "Precision (PPV): ", metrics.precision_score(y_test,y_test_pred)
print "NPV: ", cm[0,0] / float(cm[0,0]+cm[1,0])
print "Accuracy: ", metrics.accuracy_score(y_test,y_test_pred)
print "F1:", metrics.f1_score(y_test,y_test_pred)
In [103]:
print "Classification Report:\n", metrics.classification_report(y_test,y_test_pred)
In [104]:
#lr probabilities per category for first five samples
predicted_probs_lr = lr.predict_proba(X_test).round(3)
predictions_lr = lr.predict(X_test)
print "Logistic Regression predicted probabilities for first five samples in test set:\n",predicted_probs_lr[:5]
print "Logistic Regression predictions for first five samples in test set:\n",predictions_lr[:5]
y_test_lr_df = pd.DataFrame(
np.concatenate((
predicted_probs_lr,predictions_lr.reshape((predictions_lr.shape[0],-1)),
y_test.reshape((y_test.shape[0],-1))),axis=1
),
columns = ["class_0","class_1","predicted","actual"])
y_test_lr_df.head()
Out[104]:
In [108]:
bad_y_class_0 = y_test_lr_df[np.logical_and(y_test_lr_df.class_0>.9, y_test_lr_df.actual==1.0)]
print bad_y_class_0
bad_y_class_1 = y_test_lr_df[np.logical_and(y_test_lr_df.class_1>.9, y_test_lr_df.actual==0.0)]
print bad_y_class_1
In [109]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train,y_train)
predicted_probs_rf = rf.predict_proba(X_test)
predictions_rf = rf.predict(X_test)
y_test_rf_df = pd.DataFrame(
np.concatenate((
predicted_probs_rf,predictions_rf.reshape((predictions_rf.shape[0],-1)),
y_test.reshape((y_test.shape[0],-1))),axis=1
),
columns = ["class_0","class_1","predicted","actual"])
y_test_rf_df.head()
Out[109]:
In [111]:
#generate lr model false positive and true positive rates
fpr_lr, tpr_lr, thresholds_lr = metrics.roc_curve(y_test, predicted_probs_lr[:,1])
#generate same for random forest model
fpr_rf, tpr_rf, thresholds_rf = metrics.roc_curve(y_test, predicted_probs_rf[:,1])
# plot LR and RF model ROC curves
sns.plt.plot(fpr_lr, tpr_lr,label="lr")
sns.plt.plot(fpr_rf, tpr_rf,label="rf")
sns.plt.xlim([0, 1])
sns.plt.ylim([0, 1.05])
sns.plt.legend(loc="lower right")
sns.plt.xlabel('False Positive Rate (1 - Specificity)')
sns.plt.ylabel('True Positive Rate (Sensitivity)')
Out[111]:
In [113]:
# calculate AUC for lr and rf
print "LR model AUC: ",metrics.roc_auc_score(y_test, predicted_probs_lr[:,1])
print "RF model AUC: ",metrics.roc_auc_score(y_test, predicted_probs_rf[:,1])
In [114]:
# plot LR and RF model ROC curves
sns.plt.plot(fpr_lr, tpr_lr,label="lr")
sns.plt.plot(fpr_lr,thresholds_lr, label="lr_thresh")
sns.plt.xlim([0, 1])
sns.plt.ylim([0, 1.05])
sns.plt.legend(loc="center")
sns.plt.xlabel('False Positive Rate (1 - Specificity)')
sns.plt.ylabel('True Positive Rate (Sensitivity) or Class 1 Threshold Probability')
Out[114]:
In [118]:
y_test_lr_df["predicted_072"] = (y_test_lr_df.class_1 > 0.72).astype(float)
print y_test_lr_df.head()
print "Confusion matrix at original 0.5 threshold:\n",metrics.confusion_matrix(y_test_lr_df.actual,
y_test_lr_df.predicted),"\n"
print "Classification Report at original 0.5 threshold:\n", metrics.classification_report(y_test_lr_df.actual,
y_test_lr_df.predicted),"\n"
print "Confusion matrix at 0.72 threshold:\n",metrics.confusion_matrix(y_test_lr_df.actual,
y_test_lr_df.predicted_072),"\n"
print "Classification Report at 0.72 threshold:\n", metrics.classification_report(y_test_lr_df.actual,
y_test_lr_df.predicted_072)
In [119]:
# calculate AUC using y_pred_class (producing incorrect results)
print "Wrong way to calculate LR model AUC: ",metrics.roc_auc_score(y_test, predictions_lr)
print "Wrong way to calculate RF model AUC: ",metrics.roc_auc_score(y_test, predictions_rf)
In [ ]:
In [120]:
# histogram of predicted probabilities grouped by actual response value for LR
y_test_lr_df.class_1.hist(by= y_test_lr_df.actual, sharex=True, sharey=True)
#same for RF
y_test_rf_df.class_1.hist(by= y_test_rf_df.actual, sharex=True, sharey=True)
Out[120]:
In [121]:
#convert outcome into binary 0/1 attribute
le = LabelEncoder()
#create train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)
#create logistic regression object
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_test_pred = lr.predict(X_test)
print "Test set accuracy of default 0.5 threshold LR model: ",metrics.accuracy_score(y_test, y_test_pred)
In [122]:
# calculate predicted probabilities for class 1
y_pred_prob1 = lr.predict_proba(X_test)[:, 1]
# show predicted probabilities in a histogram
sns.plt.hist(y_pred_prob1)
Out[122]:
In [123]:
# calculate AUC
metrics.roc_auc_score(y_test, y_pred_prob1)
Out[123]:
In [124]:
# plot ROC curve
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob1)
sns.plt.plot(fpr, tpr)
sns.plt.xlim([0, 1])
sns.plt.ylim([0, 1.05])
sns.plt.xlabel('False Positive Rate (1 - Specificity)')
sns.plt.ylabel('True Positive Rate (Sensitivity)')
Out[124]:
In [177]:
coef = lr.coef_
coeficient_weight = pd.DataFrame({'coeficient':coef[0], 'words':word_column_names_capitol}).sort_values(by='coeficient',ascending=False)
In [202]:
my_mask(coeficient_weight,"coeficient","<=",0.2).shape[0]
my_mask(coeficient_weight,"coeficient",">=",0.27)
Out[202]:
In [151]:
#vertebral_data.outcome.value_counts
capitol_words.party_x.value_counts()
Out[151]:
In [198]:
sns.pairplot(capitol_words,x_vars=["requesting","entity","taxes"],y_vars="R", size=6, aspect=0.8)
Out[198]:
In [210]:
outcome_pred_class_log = lr.predict(X)
outcome_pred_class_log.sort()
In [211]:
# plot the class predictions
capitol_words.sort('R', inplace=True)
plt.scatter(capitol_words.requesting, capitol_words.R)
plt.plot(capitol_words.requesting, outcome_pred_class_log, color='red')
Out[211]:
In [226]:
my_mask(coeficient_weight,"coeficient","<=",-.3)
Out[226]:
In [445]:
from sklearn.feature_extraction.text import TfidfTransformer
lala = "It isn’t enough to merely downsize government, having a smaller version of the same failed systems. We must do things in a dramatically different way by reversing the undermining of federalism and the centralizing of power in Washington. We look to the example set by Republican Governors and legislators all across the nation. Their leadership in reforming and reengineering government closest to the people vindicates the role of the States as the laboratories of democracy.Our approach, like theirs, is two-fold. We look to government – local, State, and federal – for the things government must do, but we believe those duties can be carried out more efficiently and at less cost. For all other activities, we look to the private sector; for the American people’s resourcefulness, productivity, innovation, fiscal responsibility, and citizen-leadership have always been the true foundation of our national greatness For much of the last century, an opposing view has dominated public policy where we have witnessed the expansion, centralization, and bureaucracy in an entitlement society. Government has lumbered on, stifling innovation, with no incentive for fundamental change, through antiquated programs begun generations ago and now ill-suited to present needs and future requirements. As a result, today’s taxpayers – and future generations – face massive indebtedness, while Congressional Democrats and the current Administration block every attempt to turn things around. This man-made log-jam – the so-called stalemate in Washington – particularly affects the government’s three largest programs, which have become central to the lives of untold millions of Americans: Medicare, Medicaid, and Social Security committed to saving Medicare and Medicaid. Unless the programs’ fiscal ship is righted, the individuals hurt the first and the worst will be those who depend on them the most. We will save Medicare by modernizing it, by empowering its participants, and by putting it on a secure financial footing. This will be an enormous undertaking, and it should be a non-partisan one. We welcome to the effort all who sincerely want to ensure the future for our seniors and the poor. Republicans are determined to achieve that goal with a candid and honest presentation of Despite the enormous differences between Medicare and Medicaid, the two programs share the same fiscal outlook: their current courses cannot be sustained. Medicare has grown from more than 20 million enrolled in 1970 to more than 47 million enrolled today, with a projected total of 80 million in 2030. Medicaid counted almost 30 million enrollees in 1990, has about 54 million now, and under Obamacare would include an additional 11 million. Medicare spent more than $520 billion in 2010 and has close to $37 trillion in unfunded obligations, while total Medicaid spending will more than double by 2019. In many States, Medicaid’s mandates and inflexible bureaucracy have become a budgetary black hole, growing faster than most other budget lines and devouring funding for many other essential governmental functions the problem and its solutions to the American people We are the party of government reform. At a time when the federal government has become bloated, antiquated and unresponsive to taxpayers, it is our intention not only to improve management and provide better services, but also to rethink and restructure government to bring it into the twenty-first century. 
Government reform requires constant vigilance and effort because government by its nature tends to expand in both size and scope. Our goal is not just less spending in Washington but something far more important for the future of our nation: protecting the constitutional rights of citizens, The problem goes beyond finances. Poor quality healthcare is the most expensive type of care because it prolongs affliction and leads to ever more complications. Even expensive prevention is preferable to more costly treatment later on. When approximately 80 percent of healthcare costs are related to lifestyle -smoking, obesity, substance abuse-far greater emphasis has to be put upon personal responsibility for health maintenance. Our goal for both Medicare and Medicaid must be to assure that every participant receives the amount of care they need at the time they need it, whether for an expectant mother and her baby or for someone in the last moments of life. The proper purpose of regulation is to set forth clear rules of the road for the citizens, so that business owners and workers can understand in advance what they need to do, or not do, to augment the possibilities for success within the confines of the law. Regulations must be drafted and implemented to balance legitimate public safety or consumer protection goals and job creation. Constructive regulation should be a helpful guide, not a punitive threat. Worst of all, over-regulation is a stealth tax on everyone as the costs of compliance with the whims of federal agencies are passed along to the consumers at the cost of $1.75 trillion a year. Many regulations are necessary, like those which ensure the safety of food and medicine, especially from overseas. But no peril justifies the regulatory impact of Obamacare on the practice of medicine, the Dodd-Frank Act on financial services, or the EPA’s and OSHA’s overreaching regulation agenda. A Republican Congress and President will repeal the first and second, and rein in the third. We support a sunset requirement to force reconsideration of out-of-date regulations, and we endorse pending legislation to require congressional approval for all new major and costly regulations Absent reforms, these two programs are headed for bankruptcy that will endanger care for seniors and the poor sustainable prosperity, and strengthening the American family I trust Iowans, Granite staters (ph), people in South Carolina, people in Nevada, to start this process out. I kind of miss Donald Trump. He was a little teddy bear to me We always had such a loving relationship in these debates and in between and the tweets. I kind of miss him. I wish he was here. Everybody else was in the witness protection program when I went after him on behalf of what the Republican cause should be: conservative principles, believing in limited government, believing in accountability. Leading by fixing the things that are broken. Look, I am in the establishment because my dad, the greatest man alive was president of the United States and my brother, who I adore as well as fantastic brother was president. Fine, I'll take it. I guess I'm part of the establishment Barbara Bush my mom I'll take that, too But this election is not about our pedigree, this is an election about people that are really hurting. We need a leader that will fix things and have a proven record to do it. And we need someone who will take on Hillary Clinton in November. Someone who has a proven record, who has been tested, who is totally transparent. 
I released 34 years of tax returns...and 300,000 e-mails in my government record. To get the information from Hillary Clinton, you need to get a subpoena from the FBI. Senator Christie, you began this campaign touting your record as a Republican from a blue state who knows how to get things done and reach across the aisle. However, many Republicans feel that reaching across the aisle and getting things done isn't great if you get the wrong things done. And they prefer to stand on principle rather than compromise. Why are they wrong and you're right? They're not wrong. But what's wrong is your premise in the question. You can do both. There is no reason why you can't stand for principles, go and fight for them and be able also, to have to get things done in government.You know, what people are frustrated about in Washington, D.C.., and I know the folks out there tonight are incredibly frustrated because what they see is a government that doesn't work for them. You know, for the 45-year-old construction worker out there, who is having a hard time making things meet.He's lost $4,000 in the last seven years in his income because of this administration. He doesn't want to hear the talk about politics Megyn and who is establishment and who is grassroots. And who's compromised and who is principled. What he wants is something to get done.And that's the difference between being a governor and having done that for the last six years in New Jersey and being someone who has never had to be responsible for any of those decisions. Barack Obama was never responsible for those decisions.Hillary Clinton has never been responsible for those kind of decisions where they were held accountable. I've been held accountable for six years as the governor of New Jersey and with a Democratic legislature, I've gotten conservative things done. That's exactly what I'll do as president of the United States.Senator Paul, you are definitely not in the establishment category But at the beginning of this campaign, you said you were your own man when asked about your father, former Texas Congressman and three-time presidential candidate Ron Paul"
features = vectorizer.get_feature_names()
# transform the `lala` text defined above with the fitted vectorizer
vectorized = pd.DataFrame(vectorizer.transform([lala]).toarray(), columns=features)
vectorized.head()
print vectorizer.transform([lala]).toarray()
features
new_words = pd.DataFrame({'words':features, 'counts':vectorizer.transform([lala]).toarray()[0]}).sort_values(by='counts',ascending=False)
new_words
Out[445]:
In [431]:
vectorizer.transform(['Something completely unrelated']).toarray()
#vectorizer.vocabulary_.get('document')
vectorizer.transform(['Something completely unrelated']).toarray()
def reporter(words):
    """Look up each word's tf-idf value for the new text; use 0.0 when the word is missing."""
    values = []
    for word in words:
        try:
            values.append(vectorized[word][0])
        except KeyError:
            values.append(0.0)
    return values
values = reporter(word_column_names_capitol)
print len(values)
print values
# "X has 1 features per sample; expecting 13643" -- predict needs a 2-D array, so wrap the single sample in a list
print len(lr.coef_[0])
val = vectorized["iraq"][0]
print val
print "class", lr.predict([values])
print "probability", lr.predict_proba([values])
In [499]:
new_test = "It isn’t enough to merely downsize government, having a smaller version of the same failed systems. We must do things in a dramatically different way by reversing the undermining of federalism and the centralizing of power in Washington. We look to the example set by Republican Governors and legislators all across the nation. Their leadership in reforming and reengineering government closest to the people vindicates the role of the States as the laboratories of democracy.Our approach, like theirs, is two-fold. We look to government – local, State, and federal – for the things government must do, but we believe those duties can be carried out more efficiently and at less cost. For all other activities, we look to the private sector; for the American people’s resourcefulness, productivity, innovation, fiscal responsibility, and citizen-leadership have always been the true foundation of our national greatness For much of the last century, an opposing view has dominated public policy where we have witnessed the expansion, centralization, and bureaucracy in an entitlement society. Government has lumbered on, stifling innovation, with no incentive for fundamental change, through antiquated programs begun generations ago and now ill-suited to present needs and future requirements. As a result, today’s taxpayers – and future generations – face massive indebtedness, while Congressional Democrats and the current Administration block every attempt to turn things around. This man-made log-jam – the so-called stalemate in Washington – particularly affects the government’s three largest programs, which have become central to the lives of untold millions of Americans: Medicare, Medicaid, and Social Security committed to saving Medicare and Medicaid. Unless the programs’ fiscal ship is righted, the individuals hurt the first and the worst will be those who depend on them the most. We will save Medicare by modernizing it, by empowering its participants, and by putting it on a secure financial footing. This will be an enormous undertaking, and it should be a non-partisan one. We welcome to the effort all who sincerely want to ensure the future for our seniors and the poor. Republicans are determined to achieve that goal with a candid and honest presentation of Despite the enormous differences between Medicare and Medicaid, the two programs share the same fiscal outlook: their current courses cannot be sustained. Medicare has grown from more than 20 million enrolled in 1970 to more than 47 million enrolled today, with a projected total of 80 million in 2030. Medicaid counted almost 30 million enrollees in 1990, has about 54 million now, and under Obamacare would include an additional 11 million. Medicare spent more than $520 billion in 2010 and has close to $37 trillion in unfunded obligations, while total Medicaid spending will more than double by 2019. In many States, Medicaid’s mandates and inflexible bureaucracy have become a budgetary black hole, growing faster than most other budget lines and devouring funding for many other essential governmental functions the problem and its solutions to the American people We are the party of government reform. At a time when the federal government has become bloated, antiquated and unresponsive to taxpayers, it is our intention not only to improve management and provide better services, but also to rethink and restructure government to bring it into the twenty-first century. 
Government reform requires constant vigilance and effort because government by its nature tends to expand in both size and scope. Our goal is not just less spending in Washington but something far more important for the future of our nation: protecting the constitutional rights of citizens, The problem goes beyond finances. Poor quality healthcare is the most expensive type of care because it prolongs affliction and leads to ever more complications. Even expensive prevention is preferable to more costly treatment later on. When approximately 80 percent of healthcare costs are related to lifestyle -smoking, obesity, substance abuse-far greater emphasis has to be put upon personal responsibility for health maintenance. Our goal for both Medicare and Medicaid must be to assure that every participant receives the amount of care they need at the time they need it, whether for an expectant mother and her baby or for someone in the last moments of life. The proper purpose of regulation is to set forth clear rules of the road for the citizens, so that business owners and workers can understand in advance what they need to do, or not do, to augment the possibilities for success within the confines of the law. Regulations must be drafted and implemented to balance legitimate public safety or consumer protection goals and job creation. Constructive regulation should be a helpful guide, not a punitive threat. Worst of all, over-regulation is a stealth tax on everyone as the costs of compliance with the whims of federal agencies are passed along to the consumers at the cost of $1.75 trillion a year. Many regulations are necessary, like those which ensure the safety of food and medicine, especially from overseas. But no peril justifies the regulatory impact of Obamacare on the practice of medicine, the Dodd-Frank Act on financial services, or the EPA’s and OSHA’s overreaching regulation agenda. A Republican Congress and President will repeal the first and second, and rein in the third. We support a sunset requirement to force reconsideration of out-of-date regulations, and we endorse pending legislation to require congressional approval for all new major and costly regulations Absent reforms, these two programs are headed for bankruptcy that will endanger care for seniors and the poor sustainable prosperity, and strengthening the American family I trust Iowans, Granite staters (ph), people in South Carolina, people in Nevada, to start this process out. I kind of miss Donald Trump. He was a little teddy bear to me We always had such a loving relationship in these debates and in between and the tweets. I kind of miss him. I wish he was here. Everybody else was in the witness protection program when I went after him on behalf of what the Republican cause should be: conservative principles, believing in limited government, believing in accountability. Leading by fixing the things that are broken. Look, I am in the establishment because my dad, the greatest man alive was president of the United States and my brother, who I adore as well as fantastic brother was president. Fine, I'll take it. I guess I'm part of the establishment Barbara Bush my mom I'll take that, too But this election is not about our pedigree, this is an election about people that are really hurting. We need a leader that will fix things and have a proven record to do it. And we need someone who will take on Hillary Clinton in November. Someone who has a proven record, who has been tested, who is totally transparent. 
I released 34 years of tax returns...and 300,000 e-mails in my government record. To get the information from Hillary Clinton, you need to get a subpoena from the FBI. Senator Christie, you began this campaign touting your record as a Republican from a blue state who knows how to get things done and reach across the aisle. However, many Republicans feel that reaching across the aisle and getting things done isn't great if you get the wrong things done. And they prefer to stand on principle rather than compromise. Why are they wrong and you're right? They're not wrong. But what's wrong is your premise in the question. You can do both. There is no reason why you can't stand for principles, go and fight for them and be able also, to have to get things done in government.You know, what people are frustrated about in Washington, D.C.., and I know the folks out there tonight are incredibly frustrated because what they see is a government that doesn't work for them. You know, for the 45-year-old construction worker out there, who is having a hard time making things meet.He's lost $4,000 in the last seven years in his income because of this administration. He doesn't want to hear the talk about politics Megyn and who is establishment and who is grassroots. And who's compromised and who is principled. What he wants is something to get done.And that's the difference between being a governor and having done that for the last six years in New Jersey and being someone who has never had to be responsible for any of those decisions. Barack Obama was never responsible for those decisions.Hillary Clinton has never been responsible for those kind of decisions where they were held accountable. I've been held accountable for six years as the governor of New Jersey and with a Democratic legislature, I've gotten conservative things done. That's exactly what I'll do as president of the United States.Senator Paul, you are definitely not in the establishment category But at the beginning of this campaign, you said you were your own man when asked about your father, former Texas Congressman and three-time presidential candidate Ron Paul"
features = vectorizer.get_feature_names()
#vectorized = pd.DataFrame(analyze(new_test),columns=features)
#vectorizer.transform(new_test)
In [501]:
from sklearn import metrics
def state_model(stateFrame):
    # fit a per-state logistic regression; the fitted model and its coefficient table
    # are stored in globals() under a key built from the frame's repr
    words_list = wordlist(stateFrame)
    word_maxabsscaler(stateFrame, word_finder(stateFrame, 0))
    X = capitol_words[words_list]
    y = capitol_words["R"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)
    globals()['variable{}'.format(stateFrame)+"_lr"] = LogisticRegression(C=1e9, penalty='l1')
    globals()['variable{}'.format(stateFrame)+"_lr"].fit(X_train, y_train)
    y_test_pred = globals()['variable{}'.format(stateFrame)+"_lr"].predict(X_test)
    # calculate predicted probabilities for class 1
    y_pred_prob1 = globals()['variable{}'.format(stateFrame)+"_lr"].predict_proba(X_test)[:, 1]
    print "Test set accuracy of LR model: ", metrics.accuracy_score(y_test, y_test_pred)
    print "Null accuracy on the test set: ", y_test.mean()
    coef = globals()['variable{}'.format(stateFrame)+"_lr"].coef_
    globals()['variable{}'.format(stateFrame)+"_coeficient_weight"] = pd.DataFrame({'coeficient':coef[0], 'words':words_list}).sort_values(by='coeficient',ascending=False)
# plots (these helpers use the global y_test / y_pred_prob1 from the earlier cells)
def metrics_report():
    # show predicted probabilities in a histogram
    sns.plt.hist(y_pred_prob1)
def ROC_curve(stateFrame):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob1)
    sns.plt.plot(fpr, tpr)
    sns.plt.xlim([0, 1])
    sns.plt.ylim([0, 1.05])
    sns.plt.xlabel('False Positive Rate (1 - Specificity)')
    sns.plt.ylabel('True Positive Rate (Sensitivity)')
def positive_state_coefs(stateFrame):
    return my_mask(globals()['variable{}'.format(stateFrame)+"_coeficient_weight"], "coeficient", ">=", 2)
def negative_state_coefs(stateFrame):
    return my_mask(globals()['variable{}'.format(stateFrame)+"_coeficient_weight"], "coeficient", "<=", -3)
wordlist(AK)
state_model(AK)
In [503]:
def positive_state_coefs(stateFrame):
    return my_mask(globals()['variable{}'.format(stateFrame)+"_coeficient_weight"], "coeficient", ">=", 100)
def negative_state_coefs(stateFrame):
    return my_mask(globals()['variable{}'.format(stateFrame)+"_coeficient_weight"], "coeficient", "<=", -50)
In [507]:
print "Republican Words", positive_state_coefs(AK).head(10)
print "PositeWords", negative_state_coefs(AK).head(20)
ROC_curve(AK)
In [508]:
metrics_report()
In [ ]:
print states.keys()
NY = states["NY"]
state_model(NY)
In [ ]:
metrics_report()
In [ ]:
ROC_curve(NY)
positive_state_coefs(NY)
In [ ]:
negative_state_coefs(NY)